import os
import re
import shlex
import subprocess

# Path of the freshclam configuration file; overridable through the environment.
config_path = os.environ.get(
    "ANCHORE_FRESHCLAM_CONFIG_FILE", "/home/anchore/clamav/freshclam.conf"
)
# Directory holding the ClamAV signature databases; overridable through the environment.
dbdir = os.environ.get("ANCHORE_CLAMAV_DB_DIR", "/home/anchore/clamav/db")

# Command template for a scan; the path to scan is appended by the caller.
CLAMAV_CMD = (
    "clamscan --suppress-ok-results --infected --recursive --allmatch"
    " --archive-verbose --tempdir={tempdir} --database={database}"
    " --max-filesize=4000m --max-scansize=4000m"
)
# Command template for a signature database refresh.
REFRESH_CMD = "freshclam --stdout --datadir={database} --config-file={configfile}"

# The two clamscan exit codes that run_clamav treats as a completed scan.
CLAMAV_NO_FINDING_EXITCODE = 0
CLAMAV_FINDING_EXITCODE = 1

# Placeholder path used for findings whose location inside the archive could not be parsed.
UNKNOWN = "unknown"


class ScanFailedException(Exception):
    """Raised when clamscan exits with a code other than the finding / no-finding codes."""


class RefreshFailedException(Exception):
    """Raised when the freshclam database refresh exits with a non-zero code."""


class CannotScanException(Exception):
    """Raised when the input exceeds the configured clamscan size limits and is not scanned."""


def run_clamav(path_to_scan: str, tempdir: str, dbdir: str, cmdline_args: list) -> str:
    """
    Run the scan

    Example expected output for a finding from a tar:

    "t1.tar.gz: Unix.Trojan.MSShellcode-40 FOUND
    t1.tar.gz!POSIX_TAR:8615aed40f9faebd56f4dfd6af10e576e4fdc49af4f4507dd738c51ae18a7a55/layer.tar!POSIX_TAR:bin/busybox!...!(8)POSIX_TAR:elf_payload1: Unix.Trojan.MSShellcode-40 FOUND"

    :param path_to_scan: str path of file to scan
    :param tempdir: str dir path to use as tmp during scan
    :param dbdir: str path to clamav virus db
    :param cmdline_args: list of extra clamscan options ("key" or "key=value", without leading dashes)
    :return: the combined output of the clamscan command as a str
    :raises CannotScanException: if the file is not smaller than the configured max-filesize / max-scansize
    :raises ScanFailedException: if clamscan exits with a code other than finding / no-finding
    """
    base_cmd = append_cmdline_args(
        CLAMAV_CMD.format(tempdir=tempdir, database=dbdir), cmdline_args
    )
    # The command is executed through a shell, so the target path must be quoted: otherwise a path containing
    # whitespace or shell metacharacters is split into several arguments (or interpreted by the shell).
    # shlex.quote leaves plain paths untouched, so the command is unchanged for ordinary inputs.
    clamscan_cmd = "{} {}".format(base_cmd, shlex.quote(path_to_scan))

    if not is_squashed_tar_valid_for_scan(path_to_scan, clamscan_cmd):
        print(
            "Squashed tar {} is larger than configured clamscan parameters for filesize or scansize".format(
                path_to_scan
            )
        )
        raise CannotScanException("Input image is larger than configured max size")

    print("Running command {}".format(clamscan_cmd))
    status, output = subprocess.getstatusoutput(clamscan_cmd)

    # Both "clean" and "findings present" are successful scans; anything else is a failure of the scan itself
    if status not in (CLAMAV_FINDING_EXITCODE, CLAMAV_NO_FINDING_EXITCODE):
        raise ScanFailedException(
            "clamav execution failed. returned exit code {}. Output = {}".format(
                status, output
            )
        )
    return output


def is_squashed_tar_valid_for_scan(path_to_scan: str, clamscan_cmd: str) -> bool:
    """
    Decide whether the squashed tar is small enough to be scanned.

    The scan should not be attempted unless the file on disk is strictly smaller than both the
    max-filesize=<value> and the max-scansize=<value> found in the clamscan command.

    :param path_to_scan: str path of the squashed tar on disk
    :param clamscan_cmd: str full clamscan command line containing both size options
    :return: True if the file is smaller than both limits, False otherwise
    """
    tar_size = os.path.getsize(path_to_scan)

    # Resolve both limits up front, before any comparison is made
    limits = [
        get_bytes_from_cmd_for_key(clamscan_cmd, option)
        for option in ("max-filesize", "max-scansize")
    ]

    return all(tar_size < limit for limit in limits)


def get_bytes_from_cmd_for_key(clamscan_cmd: str, param_key: str) -> int:
    """
    Extract the value of a `<param_key>=<value>` size option from a command line and convert it to bytes.

    Supported value formats:
      - `<n>M` / `<n>m`: megabytes (n * 1000000)
      - `<n>K` / `<n>k`: kilobytes (n * 1000)
      - `<n>`: treated as kilobytes (n * 1000)
        NOTE(review): clamscan man pages differ between versions on the unit of a bare number - confirm
        against the deployed ClamAV version.

    :param clamscan_cmd: str command line to search
    :param param_key: str option name to look for, e.g. "max-filesize"
    :return: the option value converted to bytes
    :raises ValueError: if the option is not present in the command or its value is not a valid size
    """
    # Escape the key so that it is always matched literally
    found = re.search(r"{}=(.*?)(\s|$)".format(re.escape(param_key)), clamscan_cmd)
    if found is None:
        raise ValueError(
            "Option {} not found in command: {}".format(param_key, clamscan_cmd)
        )

    raw_value = found.group(1)
    unit = raw_value[-1:]

    if unit in ("M", "m"):
        return int(raw_value[:-1]) * 1000000
    if unit in ("K", "k"):
        return int(raw_value[:-1]) * 1000

    # No unit suffix: result is in kb
    return int(raw_value) * 1000


def append_cmdline_args(cmd: str, cmdline_args: list) -> str:
    """
    Merge user supplied options into a command line.

    Each arg is given without leading dashes, either as "key" or as "key=value":
      - "key=value": if the command already contains the option `--key` (with or without a value), every
        occurrence is replaced by `--key=value`; otherwise `--key=value` is appended
      - "key": always appended as `--key`

    :param cmd: str base command line
    :param cmdline_args: list of str options to append or replace (None is treated as empty)
    :return: the resulting command line, stripped of surrounding whitespace
    """
    command = cmd
    for arg in cmdline_args or []:
        key, has_value, _ = arg.partition("=")
        replaced = 0
        if has_value:
            # Match the exact option only: a plain substring test would treat e.g. "max-files" as already
            # present because of "--max-filesize" and then silently drop the arg.
            option_re = r"(?<!\S)--{}(?:=\S*)?(?=\s|$)".format(re.escape(key))
            new_option = "--{}".format(arg)
            # A callable replacement keeps backslashes in the value literal (no template escapes)
            command, replaced = re.subn(
                option_re, lambda _match, text=new_option: text, command
            )
        if not replaced:
            command += " --{}".format(arg)
    return command.strip()


def refresh_clamavdb(databasedir: str, configfile: str, cmdline_args: list) -> str:
    """
    Execute the signature database refresh command and hand back what it printed.

    :param databasedir: str path to where the clamav db resides
    :param configfile: str path to the config file
    :param cmdline_args: list of extra options merged into the refresh command
    :return: output of the command execution as a str
    :raises RefreshFailedException: if the command exits with a non-zero code
    """
    print("Refreshing ClamAV DB")

    base_cmd = REFRESH_CMD.format(database=databasedir, configfile=configfile)
    refresh_cmd = append_cmdline_args(base_cmd, cmdline_args)
    print("Running command {}".format(refresh_cmd))

    exit_code, cmd_output = subprocess.getstatusoutput(refresh_cmd)
    if exit_code != 0:
        raise RefreshFailedException(
            "Could not refresh clamav db. returned exit code {}. Output = {}".format(
                exit_code, cmd_output
            )
        )
    return cmd_output


def parse_refresh_output(output: str) -> dict:
    """
    Extract the daily / main / bytecode database versions from the refresh output.

    Example output:
      Database test passed.
      daily.cvd updated (version: 25889, sigs: 3716328, f - level: 63, builder: raynman)
      main.cvd database is up to date (version: 59, sigs: 4564902, f - level: 60, builder: sigmgr)
      bytecode.cvd database is up to date (version: 331, sigs: 94, f - level: 63, builder: anvilleg)

    :param output: str output of the refresh command
    :return: dict with keys "daily", "main" and "bytecode"; a value is "" when no line reported that database
    """
    matchers = (
        ("daily", re.compile(r"\W*daily\.c[lv]d .*\(version: ([0-9]+),.*")),
        ("main", re.compile(r"\W*main\.c[lv]d .*\(version: ([0-9]+),.*")),
        ("bytecode", re.compile(r"\W*bytecode\.c[lv]d .*\(version: ([0-9]+),.*")),
    )
    versions = {db_name: "" for db_name, _ in matchers}

    for text_line in output.splitlines():
        for db_name, matcher in matchers:
            hit = matcher.match(text_line)
            if hit:
                # A later line for the same database overrides an earlier one
                versions[db_name] = hit.group(1)
                break

    return versions


def parse_clamscan_output_line(scanned_file: str, line: str) -> tuple:
    """
    Turn one line of clamscan output into a (path, signature) pair.

    :param scanned_file: full path of tarball scanned, for context
    :param line: a single line of the clamscan output
    :return: (path, signature) for a finding located inside the archive, (UNKNOWN, signature) for a finding
             reported against the scanned file itself, or an empty tuple if the line is not a finding
    """
    escaped_file = re.escape(scanned_file)

    nested_regex = r"{}\!POSIX_TAR:.+\.\.\.\!\([0-9]\)POSIX_TAR:(.+):\s+(\S+)\s+FOUND$".format(
        escaped_file
    )
    nested = re.match(nested_regex, line)
    if nested is not None:
        inner_path, signature = nested.groups()
        # The tar is the rootfs of an image and its member names omit the leading slash, so add it back
        return "/" + inner_path, signature

    # Failsafe for findings reported in an unexpected format,
    # I.E. eicar.com image that is plaintext and not executable.
    whole_file_regex = r"{}:\s+(.+)\s+FOUND$".format(escaped_file)
    whole_file = re.match(whole_file_regex, line)
    if whole_file is None:
        return tuple()

    return UNKNOWN, whole_file.group(1)


def parse_clamscan(scanned_file: str, raw_output: str) -> list:
    """
    Convert the raw clamscan output into a list of findings.

    Example output:
    /analysis_scratch/a5346cac-802a-4cb9-a0c5-756b9b51858b/squashed.tar: Unix.Trojan.MSShellcode-40 FOUND
    /analysis_scratch/a5346cac-802a-4cb9-a0c5-756b9b51858b/squashed.tar!POSIX_TAR:bin/busybox!...!(8)POSIX_TAR:elf_payload1: Unix.Trojan.MSShellcode-40 FOUND"

    Each line is formatted as: `<path>: <signature> FOUND`

    With the --archive-verbose flag on clamscan, it will output the path within the archive that the item is found

    Returns a list of objects [ {"path": "/somepath", "signature": <some signature for scanner>} ]

    Findings with a known path come first, in output order. A signature that was only ever reported without a
    path is added once at the end with the path set to UNKNOWN, in the order it was first seen.

    :param scanned_file: str full path of file that was scanned
    :param raw_output: str output of the clamscan command
    :return: list of dicts
    """

    # clamscan prints "----------- SCAN SUMMARY -----------" ahead of its statistics; the bare
    # "SUMMARY" form is still accepted so that anything matched previously keeps matching.
    summary_regex = re.compile(r"-+ (?:SCAN )?SUMMARY -+")
    results = []

    # dict used as an insertion-ordered set so that the returned order is deterministic
    undefined_path_signatures = {}
    defined_path_signatures = set()
    for line in raw_output.splitlines():
        if summary_regex.match(line):
            # Output complete
            break

        result = parse_clamscan_output_line(scanned_file, line)
        if not result:
            continue

        path, signature = result
        if path == UNKNOWN:
            # Keep a unique set of signatures when the path is undefined (see parse_clamscan_output_line)
            undefined_path_signatures.setdefault(signature, None)
        else:
            results.append({"path": path, "signature": signature})
            defined_path_signatures.add(signature)

    # For each signature that doesn't have a defined path, append it.
    results.extend(
        {"path": UNKNOWN, "signature": signature}
        for signature in undefined_path_signatures
        if signature not in defined_path_signatures
    )
    return results


class ScanResult:
    """Outcome of one scanner run: the scanner name, its metadata and its findings."""

    def __init__(self, name: str, metadata: dict, findings: list):
        self.name = name
        self.metadata = metadata
        self.findings = findings

    def to_json(self):
        """Return the result as a dict with the keys "scanner", "findings" and "metadata"."""
        document = {}
        document["scanner"] = self.name
        document["findings"] = self.findings
        document["metadata"] = self.metadata
        return document


class MalwareScanner:
    """
    Base class for malware scanners that can be run by the analyzer.

    Subclasses set `name` (the key of their section in the config) and implement `run`.
    """

    name = None

    def __init__(self, config, squashed_tar_path, tempdir):
        """
        Store the inputs and derive the scanner settings from the config section keyed by `name`.

        :param config: the config dict for this named scanner
        :param squashed_tar_path: the full file path to the squash.tar to be scanned
        :param tempdir: path to use as temp space during scans
        """
        self.config = config
        self.tar_path = squashed_tar_path
        self.tempdir = tempdir

        section = None if self.config is None else self.config.get(self.name)

        if section is None:
            # No configuration for this scanner: disabled, with default settings
            self.enabled = False
            self.refresh_enabled = True
            self.clamscan_args = []
            self.freshclam_args = []
        else:
            # Without a tar to scan the scanner is disabled regardless of the config
            if self.tar_path is None:
                self.enabled = False
            else:
                self.enabled = section.get("enabled", False)
            self.refresh_enabled = section.get("db_update_enabled", True)
            self.clamscan_args = section.get("clamscan_args", [])
            self.freshclam_args = section.get("freshclam_args", [])

    def run(self) -> ScanResult:
        """Execute the scanner; must be implemented by subclasses."""
        raise NotImplementedError


class ClamAVRunner(MalwareScanner):
    """Malware scanner implementation backed by ClamAV."""

    name = "clamav"

    def scan(self) -> list:
        """
        Scan the tar and parse whatever clamscan printed.

        :return: list of findings, empty if the scan produced no output
        """
        raw_output = run_clamav(
            self.tar_path,
            tempdir=self.tempdir,
            dbdir=dbdir,
            cmdline_args=self.clamscan_args,
        )
        if raw_output:
            print("Raw ClamAV stdout: " + raw_output)
            return parse_clamscan(self.tar_path, raw_output)

        return []

    def refresh_db(self) -> dict:
        """
        Refresh the signature db and report the resulting versions.

        :return: version dict
        """
        return parse_refresh_output(
            refresh_clamavdb(
                databasedir=dbdir,
                configfile=config_path,
                cmdline_args=self.freshclam_args,
            )
        )

    def run(self):
        """
        Refresh the db (when enabled) and scan, if this scanner is enabled.

        :return: ScanResult; metadata and findings are both None when the scanner is disabled
        """
        print("running clamav scan")

        if not self.enabled:
            return ScanResult(name=self.name, metadata=None, findings=None)

        # The db refresh happens before the scan so the reported versions are the ones used
        db_versions = self.refresh_db() if self.refresh_enabled else {}
        meta = {
            "db_version": db_versions,
            "db_update_enabled": self.refresh_enabled,
        }
        print("ClamAV scan metadata: {}".format(meta))

        findings = self.scan()
        print("ClamAV findings: {}".format(findings))

        return ScanResult(name=self.name, metadata=meta, findings=findings)
